# Print skimr's per-column summary (missing counts, ranges, quantiles) of the
# raw ny_noaa table before any cleaning is applied.
skimr::skim(ny_noaa)
## Skim summary statistics
##  n obs: 2595176 
##  n variables: 7 
## 
## ── Variable type:character ────────────────────────────────────────────────
##  variable missing complete       n min max empty n_unique
##        id       0  2595176 2595176  11  11     0      747
##      tmax 1134358  1460818 2595176   1   4     0      532
##      tmin 1134420  1460756 2595176   1   4     0      548
## 
## ── Variable type:Date ─────────────────────────────────────────────────────
##  variable missing complete       n        min        max     median
##      date       0  2595176 2595176 1981-01-01 2010-12-31 1997-01-21
##  n_unique
##     10957
## 
## ── Variable type:integer ──────────────────────────────────────────────────
##  variable missing complete       n  mean     sd  p0 p25 p50 p75  p100
##      prcp  145838  2449338 2595176 29.82  78.18   0   0   0  23 22860
##      snow  381221  2213955 2595176  4.99  27.22 -13   0   0   0 10160
##      snwd  591786  2003390 2595176 37.31 113.54   0   0   0   0  9195
##      hist
##  ▇▁▁▁▁▁▁▁
##  ▇▁▁▁▁▁▁▁
##  ▇▁▁▁▁▁▁▁

The ny_noaa data were accessed from the NOAA National Climatic Data Center and consist of 2595176 rows and 7 columns. Key variables include the observation date (date), precipitation (prcp), snowfall (snow), snow depth (snwd), maximum temperature (tmax), and minimum temperature (tmin). For tmax and tmin, a proportion of 0.4371025 (about 44%) of the values is missing, which is a serious problem for any temperature-related calculation because we cannot be sure whether the missing data would lead to a different result.

Data cleaning

# Build the working data set:
#   * split `date` into character columns year / month / day,
#   * parse tmax and tmin (stored as text) to integers and divide by 10,
#   * divide prcp by 10,
#   * keep a random sample of 5000 rows.
# NOTE(review): the divisions by 10 presumably convert tenths of a unit
# (tenths of deg C / tenths of mm) -- confirm against the NOAA documentation.
ny_noaa_cleaned <- ny_noaa %>%
  separate(date, into = c("year", "month", "day"), sep = "-") %>%
  mutate(
    tmax = as.integer(tmax) / 10,
    tmin = as.integer(tmin) / 10,
    prcp = prcp / 10
  ) %>%
  sample_n(5000)

# Summarise the cleaned sample the same way the raw table was summarised.
skimr::skim(ny_noaa_cleaned)
## Skim summary statistics
##  n obs: 5000 
##  n variables: 9 
## 
## ── Variable type:character ────────────────────────────────────────────────
##  variable missing complete    n min max empty n_unique
##       day       0     5000 5000   2   2     0       31
##        id       0     5000 5000  11  11     0      587
##     month       0     5000 5000   2   2     0       12
##      year       0     5000 5000   4   4     0       30
## 
## ── Variable type:integer ──────────────────────────────────────────────────
##  variable missing complete    n  mean     sd p0 p25 p50 p75 p100     hist
##      snow     713     4287 5000  5.56  27.33  0   0   0   0  406 ▇▁▁▁▁▁▁▁
##      snwd    1131     3869 5000 35.37 109.84  0   0   0   0 1067 ▇▁▁▁▁▁▁▁
## 
## ── Variable type:numeric ──────────────────────────────────────────────────
##  variable missing complete    n  mean    sd    p0  p25  p50  p75  p100
##      prcp     260     4740 5000  3.08  8.57   0    0    0    2.3 274.3
##      tmax    2218     2782 5000 13.99 11.3  -22.2  4.4 15   23.9  36.7
##      tmin    2221     2779 5000  3.05 10.56 -37.2 -3.9  3.3 11.7  25  
##      hist
##  ▇▁▁▁▁▁▁▁
##  ▁▁▃▆▆▆▇▂
##  ▁▁▂▃▇▇▇▂

NYC average minimum temperature in Jan and July

Column

Chart A

# Mean tmax, tmin and prcp per station (id), year and month, restricted to
# January and July.
# NOTE(review): the page heading mentions the average *minimum* temperature,
# but the chart below plots mean_tmax -- confirm which one is intended.
tmax_jan_july <- ny_noaa_cleaned %>%
  select(id, year, month, tmax, tmin, prcp) %>%
  filter(month %in% c("01", "07")) %>%
  # Pin the levels explicitly so "01" is always "Jan" and "07" is always
  # "July", even if the random sample happens to contain only one of them.
  mutate(
    month = factor(month, levels = c("01", "07"), labels = c("Jan", "July"))
  ) %>%
  group_by(id, year, month) %>%
  summarize(
    mean_tmax = mean(tmax, na.rm = TRUE),
    mean_tmin = mean(tmin, na.rm = TRUE),
    mean_prcp = mean(prcp, na.rm = TRUE)
  ) %>%
  # Drop the leftover grouping so later verbs operate on the whole table.
  ungroup()

# Scatter plot of mean maximum temperature by year, coloured by month.
tmax_jan_july %>%
  # mean(..., na.rm = TRUE) is NaN when every tmax in a group is missing;
  # remove those rows here instead of letting plotly discard them with a
  # warning.
  filter(!is.na(mean_tmax)) %>%
  mutate(
    text_label = str_c(
      "Year: ", year, " MaxTemp: ", round(mean_tmax, 1), " C"
    )
  ) %>%
  plot_ly(
    x = ~year, y = ~mean_tmax, type = "scatter", mode = "markers",
    alpha = 0.5,
    color = ~month,
    # Explicit two-colour palette: with only two month levels the default
    # palette lookup asks RColorBrewer for fewer than its minimum of 3 colours.
    colors = c("#66C2A5", "#8DA0CB"),
    text = ~text_label
  )
## Warning: Ignoring 312 observations
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

Column

Chart B

# Box plot of prcp for each year from 2000 through 2010, keeping only rows
# whose prcp is below 10. `year` is a character column, so the range is
# matched as strings.
ny_noaa_cleaned %>%
  filter(
    year %in% as.character(2000:2010),
    prcp < 10
  ) %>%
  plot_ly(
    y = ~prcp,
    color = ~year,
    colors = "Set2",
    type = "box"
  )

Column

Chart C

# Bar chart of the number of sampled rows per station id, with the bars
# ordered by that count.
count(ny_noaa_cleaned, id) %>%
  mutate(id = fct_reorder(id, n)) %>%
  plot_ly(
    x = ~id,
    y = ~n,
    color = ~id,
    type = "bar"
  )
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# Bar chart of the number of rows per neighbourhood, with the bars ordered by
# that count.
# NOTE(review): nyc_airbnb is not defined anywhere in the visible part of this
# file and is unrelated to the NOAA charts -- possibly leftover template code;
# confirm whether this chart is still wanted.
count(nyc_airbnb, neighbourhood) %>%
  mutate(neighbourhood = fct_reorder(neighbourhood, n)) %>%
  plot_ly(
    x = ~neighbourhood,
    y = ~n,
    color = ~neighbourhood,
    type = "bar"
  )
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors